import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn import datasets, linear_model
import statsmodels.api as sm
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
from plotly.subplots import make_subplots
import math
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.manifold import MDS
from sklearn.manifold import Isomap
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_squared_log_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.datasets import make_biclusters
from sklearn.cluster import SpectralCoclustering
from sklearn.metrics import consensus_score
import scipy.linalg as la
COVID-19 is a disease caused by SARS-CoV-2. It has taken millions of lives and affects people's everyday living. Generally, according to the information available up to 16/1/2021, European and North American countries are the most affected by this pandemic. Most governments have policies to alleviate the exponentially rising COVID-19 confirmed cases, such as lockdowns and social distancing. Most governments have also increased their medical supplies to cope with the skyrocketing number of patients. Besides government policy, the objective of this project is to investigate any other data that would predict the infected rate of each country. The first part of the project is an Exploratory Data Analysis on the influence of geographical, economical, and societal factors outside of government policies on the infected rate and the death rate of each country. The second part is predictive modelling using linear regression. The predictive modelling is based on the evidence from the first part of the project.
The Dataset included in this project are the following:
Main Datasets:
Auxiliary Datasets:
The World Bank provides a DataBank available online for download. I select two out of the 49 datasets available online, which are Health and Nutrition and Population Statistics, and World Development Indicators.
# Load the two World Bank DataBank extracts (one row per country x metric,
# with one column per year).
healthData = pd.read_csv("data/Health_Nutrition_and_Population.csv")
worldDevelopmentData = pd.read_csv("data/World_Development_Indicators.csv")
worldDevelopmentData.head()
Since the data is stored in country-metrics format, I would need to convert it to country format, in which the row would be the country, and the columns would be the metrics name.
At the same time, since some data is missing in certain year, I would fill the missing value with the latest data available (e.g. metric is missing in 2019 and 2020, I would pick the data in 2018 to replace the missing value)
# Standardise the raw DataBank layout: identifier columns followed by one
# column per year (2016-2020).
healthData.columns = ['Metrics', 'Metrics_Code', 'Country_Name', 'Country_Code', '2016', '2017', '2018', '2019', '2020']
worldDevelopmentData.columns = ['Country_Name', 'Country_Code', 'Metrics', 'Metrics_Code',
                                '2016', '2017', '2018', '2019', '2020']
# The last 5 rows of each extract are DataBank footer notes, not data.
healthData = healthData[:-5]
worldDevelopmentData = worldDevelopmentData.iloc[:-5]
# DataBank marks missing entries as '..'; convert them to NaN and cast the
# year columns to float.
healthData = healthData.replace('..', np.nan)
healthData['2016'] = healthData['2016'].apply(lambda x: float(x))
healthData['2017'] = healthData['2017'].apply(lambda x: float(x))
healthData['2018'] = healthData['2018'].apply(lambda x: float(x))
healthData['2019'] = healthData['2019'].apply(lambda x: float(x))
healthData['2020'] = healthData['2020'].apply(lambda x: float(x))
# Pivot to country-per-row / metric-per-column, one sub-table per year.
health_pivot = healthData.pivot_table(values=['2016', '2017', '2018', '2019', '2020'], index=['Country_Name'],
                                      columns=['Metrics'], dropna = False)
# Start from 2020 and backfill each missing value with the most recent
# earlier year that has data (2019, then 2018, ...).
clean_health_df = health_pivot['2020']
years = ['2019', '2018', '2017', '2016']
for year in years:
    clean_health_df = clean_health_df.fillna(health_pivot[year])
# Same cleaning for the World Development Indicators extract.
worldDevelopmentData = worldDevelopmentData.replace('..', np.nan)
worldDevelopmentData['2016'] = worldDevelopmentData['2016'].apply(lambda x: float(x))
worldDevelopmentData['2017'] = worldDevelopmentData['2017'].apply(lambda x: float(x))
worldDevelopmentData['2018'] = worldDevelopmentData['2018'].apply(lambda x: float(x))
worldDevelopmentData['2019'] = worldDevelopmentData['2019'].apply(lambda x: float(x))
worldDevelopmentData['2020'] = worldDevelopmentData['2020'].apply(lambda x: float(x))
Dev_pivot = worldDevelopmentData.pivot_table(values=['2016', '2017', '2018', '2019', '2020'], index=['Country_Name'],
                                             columns=['Metrics'], dropna = False)
clean_dev_df = Dev_pivot['2020'].copy()
for year in years:
    clean_dev_df = clean_dev_df.fillna(Dev_pivot[year])
# Join the two tables, keeping only the health metrics that are not already
# present in the development table.
cols_to_use =clean_health_df.columns.difference(clean_dev_df.columns)
df_world = clean_dev_df.join(clean_health_df[cols_to_use])
The data included in the JHU COVID-19 are:
The data included in the current study would be the data on 16th Jan 2021
# Load the JHU CSSE global time series; keep only the location columns and
# the cumulative counts as of 16 Jan 2021.
confirmed = pd.read_csv('data/time_series_covid19_confirmed_global.csv')
cols_to_use = ['Province/State', 'Country/Region', 'Lat', 'Long', '1/16/21']
confirmed = confirmed[cols_to_use]
confirmed = confirmed.rename(columns= {'1/16/21': 'total_cases'})
# Rows with no province are country-level rows; use the country name as the
# index key so every row has a label.
confirmed['Province/State'] = confirmed['Province/State'].fillna(confirmed['Country/Region'])
confirmed = confirmed.set_index('Province/State')
death = pd.read_csv('data/time_series_covid19_deaths_global.csv')
death = death.rename(columns= {'1/16/21': 'total_deaths'})
death['Province/State'] = death['Province/State'].fillna(death['Country/Region'])
death = death[['Province/State', 'total_deaths']]
death = death.set_index('Province/State')
# Combine confirmed cases and deaths into a single table.
COVID19 = confirmed.join(death)
# List all countries that have records in world df but not in COVID-19 dataset
df_world.index.difference(COVID19.index)
There are 4 types of reason why the index is missing in the COVID-19 datasets, which are:
The first three cases will be included in this project, while the fourth case will be dropped from the dataset
# Map World Bank country names onto the JHU naming convention.
country_rename = {'Bahamas, The': 'Bahamas', 'Brunei Darussalam': 'Brunei', 'Congo, Dem. Rep.': 'Congo (Kinshasa)',
                  'Congo, Rep.': 'Congo (Brazzaville)', 'Czech Republic': 'Czechia', 'Egypt, Arab Rep.': 'Egypt',
                  'Gambia, The':'Gambia', 'Hong Kong SAR, China': 'Hong Kong', 'Iran, Islamic Rep.': 'Iran',
                  'Korea, Rep.': 'Korea, South', 'Kyrgyz Republic': 'Kyrgyzstan', 'Lao PDR': 'Laos', 'Macao SAR, China': 'Macau',
                  'Myanmar': 'Burma', 'Russian Federation': 'Russia', 'Sint Maarten (Dutch part)': 'Sint Maarten',
                  'Slovak Republic': 'Slovakia', 'St. Kitts and Nevis': 'Saint Kitts and Nevis', 'St. Lucia': 'Saint Lucia',
                  'St. Martin (French part)': 'St Martin', 'St. Vincent and the Grenadines': 'Saint Vincent and the Grenadines',
                  'Syrian Arab Republic': 'Syria', 'United States': 'US', 'Venezuela, RB': 'Venezuela', 'Yemen, Rep.': 'Yemen'}
# Countries reporting zero cases, hence absent from the JHU table.
country_with_0cases = ["Kiribati", "Korea, Dem. People’s Rep.", "Tuvalu", "Tonga", "Turkmenistan", "Palau", "Nauru",
                       "Micronesia, Fed. Sts."]
# These countries' data are separated by their states/ provinces
country_to_merge = ['Canada', 'China', 'Australia']
# US territories with no World Bank record of their own.
country_to_drop= ['American Samoa', 'Guam', "Northern Mariana Islands", "Puerto Rico", "Virgin Islands (U.S.)"]
df_world.rename(index = country_rename, inplace = True)
df_world.drop(index = country_to_drop, inplace = True)
# Hong Kong and Macau will be classified as separate samples from China in this analysis
COVID19.loc['Hong Kong', 'Country/Region'] = 'Hong Kong'
COVID19.loc['Macau', 'Country/Region'] = 'Macau'
# Get all provinces in China from wikipedia
# Check if all provinces in China are captured in our existing dataset
html = requests.get('https://en.wikipedia.org/wiki/Provinces_of_China')
soup = BeautifulSoup(html.content, 'lxml')
table = soup.select('table.sortable')
chineseProvince = pd.read_html(str(table))[0]['Province']
# Strip footnote markers ('[...]') and administrative suffixes so the names
# match the JHU 'Province/State' labels.
chineseProvince = [province.partition('[')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Province')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Municipality')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Autonomous Region')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Uyghur')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Zhuang')[0] for province in chineseProvince]
chineseProvince = [province.partition(' Hui')[0] for province in chineseProvince]
# Hong Kong, Macau and Taiwan are treated as separate samples in this study.
chineseProvince = [province for province in chineseProvince if province not in ('Hong Kong Special Administrative Region'
, 'Macau Special Administrative Region', 'Taiwan')]
print('Are all provinces in China covered in the dataset? ' + str(set(chineseProvince).issubset(set(COVID19[COVID19['Country/Region'] == 'China'].index))))
# Combine all confirmed cases in China's provinces. Hong Kong and Macau are
# already excluded: their 'Country/Region' was relabelled above, so they no
# longer match 'China'.
china_confirmed = COVID19[COVID19['Country/Region'] == 'China'].sum()
# sum() also concatenated the 'Country/Region' strings, so overwrite the
# location fields with China's own coordinates and name.
china_confirmed['Lat'] = 35.8617
china_confirmed['Long'] = 104.1954
china_confirmed['Country/Region'] = 'China'
COVID19.loc['China'] = china_confirmed
# Check if all states in Australia are captured in our existing confirmed cases dataset
html = requests.get('https://en.wikipedia.org/wiki/States_and_territories_of_Australia')
soup = BeautifulSoup(html.content, 'lxml')
australianStates = pd.read_html(str(soup.select('table.sortable')[0]))[0]['State']
australianStates = australianStates.append(pd.read_html(str(soup.select('table.sortable')[1]))[0]['Territory'])
# Jervis Bay Territory has no separate row in the JHU data.
australianStates = [state for state in australianStates if state != 'Jervis Bay Territory']
print('Are all states in Australia except "Jervis Bay Territory" covered in the dataset? ' + str(set(australianStates).issubset(set(COVID19[COVID19['Country/Region'] == 'Australia'].index))))
# Combine all confirmed cases in Australia's states/territories.
australia_confirmed = COVID19[COVID19['Country/Region'] == 'Australia'].sum()
# Overwrite the summed location fields with Australia's own coordinates.
australia_confirmed ['Lat'] = -25.2744
australia_confirmed ['Long'] = 133.7751
australia_confirmed ['Country/Region'] = 'Australia'
COVID19.loc['Australia'] = australia_confirmed
# Merge Canada's per-province rows into a single national record.
# First scrape the list of Canadian provinces and territories from Wikipedia
# to verify they are all present in the JHU dataset.
html = requests.get('https://en.wikipedia.org/wiki/Provinces_and_territories_of_Canada')
soup = BeautifulSoup(html.content, 'lxml')
canadaProvinces = pd.read_html(str(soup.select('table.sortable')[0]))[0]['Province']['Province']
canadaProvinces = canadaProvinces.append(pd.read_html(str(soup.select('table.sortable')[1]))[0]['Territory']['Territory'])
# Strip footnote markers and the two summary rows from the scraped tables.
canadaProvinces = [province.partition('[')[0] for province in canadaProvinces]
canadaProvinces = [province for province in canadaProvinces if province not in ['Total territories', 'Total provinces']]
print('Are all provinces and territories in Canada covered in current dataset? ' + str(set(canadaProvinces).issubset(COVID19[COVID19['Country/Region'] == 'Canada'].index)))
# Exclude cases from 'Grand Princess', 'Diamond Princess', 'Repatriated Travellers'
# (cruise-ship / repatriation rows that JHU files under Canada).
# BUG FIX: DataFrame.drop returns a new frame and is not in-place by default;
# the original call discarded its result, so these three rows were still
# included in the national sum below.
confirmed = confirmed.drop(['Grand Princess', 'Diamond Princess', 'Repatriated Travellers'])
canada_confirmed = confirmed[confirmed['Country/Region'] == 'Canada'].sum()
# Overwrite the summed location fields with Canada's own coordinates.
canada_confirmed ['Lat'] = 56.1304
canada_confirmed ['Long'] = -106.3468
canada_confirmed ['Country/Region'] = 'Canada'
COVID19.loc['Canada'] = canada_confirmed
# Join the World Bank metrics with the COVID-19 counts and derive the two
# target variables.
combined_world_df = df_world.join(COVID19)
# infected_rate: confirmed cases per head of population.
combined_world_df['infected_rate'] = combined_world_df['total_cases']/combined_world_df['Population, total']
# death_rate: case-fatality ratio (deaths per confirmed case).
combined_world_df['death_rate'] = combined_world_df['total_deaths']/combined_world_df['total_cases']
# Country with zero reported cases would have 0 infected rate and death rate
combined_world_df.loc[country_with_0cases, 'infected_rate'] = 0
combined_world_df.loc[country_with_0cases, 'death_rate'] = 0
# Continent / sub-region lookup table, keyed by country name.
countryContinent = pd.read_csv('data/countryContinent.csv', encoding='latin-1')
countryContinent.set_index('country', inplace = True)
# Entries missing from the CSV, added manually as [name, continent, sub_region].
countries_to_add = [['Channel Islands', 'Europe', 'Northern Europe'], ['Curacao', 'Americas', 'Caribbean'],
                    ['Eswatini', 'Africa', 'Southern Africa'], ['Kosovo', 'Europe', 'Southern Europe'], ['Macau', 'Asia', "Eastern Asia"],
                    ['British Virgin Islands', 'Americas', 'Caribbean']]
# Harmonise the CSV's official names with the names used in this project.
countries_to_rename = {'Brunei Darussalam': 'Brunei', 'Myanmar': 'Burma', 'Congo': 'Congo (Brazzaville)',
                       'Congo (Democratic Republic of the)': 'Congo (Kinshasa)', 'Côte d\'Ivoire': 'Cote d\'Ivoire',
                       'Czech Republic': 'Czechia', 'Iran (Islamic Republic of)': 'Iran', 'Korea (Republic of)': 'Korea, South',
                       'Lao People\'s Democratic Republic': 'Laos', 'Moldova (Republic of)': 'Moldova',
                       'Macedonia (the former Yugoslav Republic of)': 'North Macedonia', 'Russian Federation':'Russia',
                       'Sint Maarten (Dutch part)': 'Sint Maarten', 'Saint Martin (French part)': 'St Martin',
                       'Syrian Arab Republic':'Syria', 'Tanzania, United Republic of': 'Tanzania', 'United States of America': 'US',
                       'United Kingdom of Great Britain and Northern Ireland':'United Kingdom', 'Venezuela (Bolivarian Republic of)': 'Venezuela',
                       'Viet Nam': 'Vietnam', 'Palestine, State of': 'West Bank and Gaza', 'Korea (Democratic People\'s Republic of)': 'Korea, Dem. People’s Rep.',
                       'Micronesia (Federated States of)': 'Micronesia, Fed. Sts.'
                       }
countryContinent.rename(countries_to_rename, axis = 0, inplace = True)
for country in countries_to_add:
    countryContinent.loc[country[0], ['continent', 'sub_region']] = country[1:]
# Attach continent and sub-region to the combined table.
cols_to_join = ['continent', 'sub_region']
combined_world_df = combined_world_df.join(countryContinent[cols_to_join])
# Scrape ISO 3166 alpha-3 codes; the plotly choropleth below needs them.
html = requests.get('https://www.nationsonline.org/oneworld/country_code_list.htm')
soup = BeautifulSoup(html.content, 'lxml')
iso_code = pd.read_html(str(soup.select('table')))[0]
iso_code.columns = ['0', 'country', 'iso2', 'iso3', 'UNcode']
iso_code = iso_code.drop(['0', 'iso2', 'UNcode'], axis = 1)
iso_code.set_index('country', inplace = True)
# Inspect which of our countries are missing from the scraped table.
combined_world_df.index.difference(iso_code.index)
# Harmonise the scraped names with the names used in this project.
country_to_rename = {'Brunei Darussalam': 'Brunei', 'Myanmar':'Burma', 'Cape Verde': 'Cabo Verde', 'Congo, (Kinshasa)': 'Congo (Kinshasa)',
                     'Côte d\'Ivoire': 'Cote d\'Ivoire', 'Czech Republic': 'Czechia','Hong Kong, SAR China': 'Hong Kong',
                     'Iran, Islamic Republic of': 'Iran','Korea (North)': 'Korea, Dem. People’s Rep.', 'Korea (South)': 'Korea, South', 'Lao PDR': 'Laos',
                     'Macao, SAR China': 'Macau','Micronesia, Federated States of': 'Micronesia, Fed. Sts.', 'Macedonia, Republic of': 'North Macedonia',
                     'Russian Federation': 'Russia', 'Saint Vincent and Grenadines': 'Saint Vincent and the Grenadines', 'Saint-Martin (French part)': 'St Martin',
                     'Syrian Arab Republic (Syria)': 'Syria', 'Tanzania, United Republic of': 'Tanzania', 'United States of America': 'US',
                     'Venezuela (Bolivarian Republic)': 'Venezuela', 'Viet Nam': 'Vietnam','Palestinian Territory': 'West Bank and Gaza'}
# Codes missing from the table, added manually as [name, iso3].
country_to_add = [[ 'Curacao', 'CUW'], ['Eswatini', 'SWZ'], ['Kosovo', 'XKX'], ['Sint Maarten','SXM']]
# Channel Islands has no ISO alpha-3 code of its own, so it is dropped.
country_to_drop = ['Channel Islands']
iso_code.rename(index = country_to_rename, inplace = True)
for country in country_to_add:
    iso_code.loc[country[0]] = country[1]
combined_world_df.drop(country_to_drop, axis= 0, inplace = True)
combined_world_df = combined_world_df.join(iso_code)
# From here on the table is indexed by a (country, iso_code) MultiIndex.
combined_world_df.set_index([combined_world_df.index, 'iso3'], inplace = True)
combined_world_df.rename_axis(index={None: 'country', 'iso3': 'iso_code'}, inplace = True)
Out of the 1607 metrics, I have found that a lot of them have missing entries. Hence, I have to select the metrics of interest that will be included in the current study
# Histogram of missing-entry counts across all metrics.
fig, ax = plt.subplots(figsize = [6, 4])
plt.xlim(0,len(df_world))
plt.title('Number of Missing Entries in ' + str(len(combined_world_df.columns)) + ' metrics')
# NOTE(review): sns.distplot is deprecated (removed in seaborn >= 0.14);
# migrate to sns.histplot when the environment is upgraded.
sns.distplot(combined_world_df.isnull().sum(), kde=False)
| Metrics | Definition |
|---|---|
| Population, total | Total Population |
| Gross Saving (%GNI) | Gross Domestic Saving (Percentage of Gross National Income) |
| GDP per capita | GDP per capita (in current US Dollar) |
| GDP (current US\$) | Total GDP of the country (in current US Dollar) |
| Unemployment% | Percentage of unemployment population out of the total labor force |
| Rural Population% | Percentage of Rural Population |
| Current health expenditure (% of GDP)' | Current Health Expenditure out of the Total GDP |
| Current health expenditure per capita (current US$) | Current health expenditure per capita in current US dollar |
| Children Measles Immunization% | Percentage of 12-23 Month Children with Measles Immunization |
| Life Expectancy | Life Expectancy at Birth |
| TB case detection% | Tuberculosis case detection rate |
| Longitude | Longitude of the Country |
| Latitude | Latitude of the Country |
| continent | The continent that the country is in |
| sub_region | The subregion that the country is in |
| infected_rate | Total Confirm Cases out of the Total Population |
| death_rate | Total Death Cases out of the Total Confirmed Cases |
# Keep only the 17 metrics chosen for this study (see the table above).
selected_metrics = ['Population, total', 'Adjusted savings: gross savings (% of GNI)',
                    'GDP per capita (current US$)', 'GDP (current US$)', 'Unemployment, total (% of total labor force)',
                    'Rural population (% of total population)', 'Current health expenditure (% of GDP)',
                    'Current health expenditure per capita (current US$)', 'Immunization, measles (% of children ages 12-23 months)',
                    'Life expectancy at birth, total (years)', 'Tuberculosis case detection rate (%, all forms)','Long', 'Lat',
                    'continent', 'sub_region', 'infected_rate', 'death_rate']
print("Number of metrics selected: " + str(len(selected_metrics)))
selected_world_df = combined_world_df[selected_metrics]
# Rename the cols for convenient use
col_to_rename={'Adjusted savings: gross savings (% of GNI)': 'Gross Saving (%GNI)', 'GDP per capita (current US$)': 'GDP per capita',
               'Unemployment, total (% of total labor force)': 'Unemployment%', 'Rural population (% of total population)': 'Rural Population%',
               'Immunization, measles (% of children ages 12-23 months)': 'Children Measles Immunization%', 'Life expectancy at birth, total (years)': 'Life Expectancy',
               'Tuberculosis case detection rate (%, all forms)': 'TB case detection%', 'Long': 'Longitude', 'Lat': 'Latitude'}
selected_world_df = selected_world_df.rename(columns = col_to_rename)
selected_world_df.head()
# Count remaining missing entries per selected metric.
selected_world_df.isnull().sum()
Some data are exponential in nature. For example, The growth of infected rate is exponential because the spread of the disease is itself exponential (one infected person can spread the virus to many other people). The growth of GDP per capita is also exponential in nature. Therefore, I normalize and logarithmize the data of these variables
# Working copy for plotting; drop zero-case countries because log(0) is
# undefined.
plot_df = selected_world_df.reset_index()
plot_df = plot_df[plot_df['infected_rate'] != 0]
# Shift the log infected rate so its minimum is 0 ("normalized").
plot_df['normalized log infected_rate'] = (plot_df['infected_rate'] *100).apply(math.log) - min((plot_df['infected_rate'] *100).apply(math.log))
plot_df['log GDP per capita']= plot_df['GDP per capita'].apply(lambda x: math.log(x))
# Seriation method used to show correlation between variables
def seriation(data):
    """Reorder a symmetric (correlation) DataFrame by spectral seriation.

    Builds the graph Laplacian L = D - W of the matrix and sorts rows and
    columns by the Fiedler vector (eigenvector of the second-smallest
    eigenvalue), which places strongly related variables next to each other.

    Parameters
    ----------
    data : pd.DataFrame
        Square symmetric matrix, e.g. the output of ``df.corr()``.

    Returns
    -------
    pd.DataFrame
        ``data`` with rows and columns permuted into seriation order.
    """
    L = np.diag(data.sum()) - data
    # Fix: la.eigh is the correct solver for a symmetric matrix -- it returns
    # real eigenvalues already sorted ascending. The original la.eig returned
    # complex values in arbitrary order and needed a manual argsort, and the
    # stray `permutation.shape` statement was dead code.
    eig_val, eig_vec = la.eigh(L)
    fiedler = eig_vec[:, 1]
    permutation = np.argsort(fiedler)
    ordered = data.iloc[permutation]
    ordered = ordered.iloc[:, permutation]
    return ordered
# Seriate the correlation matrix so correlated metrics sit next to each other.
# NOTE(review): DataFrame.corr() silently drops non-numeric columns
# ('continent', 'sub_region') in older pandas; newer pandas requires
# numeric_only=True -- confirm against the pinned pandas version.
heatmap = seriation(selected_world_df.corr())
plt.figure(figsize=(15,15))
# Absolute values: only the strength of association matters for this plot.
heatmap = heatmap.apply(lambda x:abs(x))
plt.title('Correlation Between Metrics')
sns.heatmap(data = heatmap, cmap = 'Greens', annot=True, fmt=".2f")
There are several interesting insight that can be drawn from the heatmap:
note: Please note that the correlation is absolute correlation for visualization purpose
# World map coloured by infected rate, located via the ISO alpha-3 codes.
fig = go.Figure(data=go.Choropleth(
    locations = selected_world_df.index.get_level_values('iso_code'),
    z = selected_world_df['infected_rate'],
    text = selected_world_df.index.get_level_values('country'),
    colorscale = 'Reds',
    marker_line_color='darkgray',
    marker_line_width=0.3,
    colorbar_title = 'Infected Rate',
    hovertemplate =
    "%{text} <br>" +
    'infected_rate: %{z:.4%}</br>'
))
fig.update_layout(
    title="Percentage of people infected by COVID-19 by country",
)
fig.show()
# Top-10 countries by infected rate, coloured by continent.
most_infected_country = selected_world_df['infected_rate'].sort_values(ascending = False)
most_infected_country = most_infected_country.iloc[:10]
most_infected_country = pd.DataFrame(most_infected_country)
# Re-attach the continent column (aligned on the shared index).
most_infected_country['continent'] = selected_world_df['continent']
fig = px.bar(most_infected_country,
             x=most_infected_country.index.get_level_values('country'),
             y='infected_rate',
             title = 'Countries with Highest Infected Rate',
             color = 'continent')
fig.update_layout(
    xaxis_title="Country",
    yaxis_title="Infected Rate",
)
fig.show()
# Top-10 countries by death rate (case-fatality ratio), coloured by continent.
most_death_country = selected_world_df['death_rate'].sort_values(ascending = False)
most_death_country = most_death_country.iloc[:10]
most_death_country = pd.DataFrame(most_death_country)
most_death_country['continent'] = selected_world_df['continent']
fig = px.bar(most_death_country,
             x=most_death_country.index.get_level_values('country'),
             y='death_rate',
             title = 'Countries with Highest Death Rate',
             color = 'continent')
fig.update_layout(
    xaxis_title="Country",
    yaxis_title="Death Rate",
)
fig.show()
The infected rate by country is consistent with our common sense that Europe and North America have the worst COVID-19 situation in terms of the percentage of people infected. It would be intuitive to predict that the countries with the highest infected rate would have the highest death rate, because the number of patients could overload the medical resources of the nation. Hence, patients would receive less healthcare and it could possibly lead to a higher death rate.
However, based on the evidence shown above, none of the countries with the highest infected rate have the highest death rate. I decided to look further to investigate the relationship
# NOTE(review): plot2_df is an alias, not a copy -- the new flag column is
# also added to plot_df.
plot2_df = plot_df
# Flag the 10 countries with the highest death rate for colouring.
plot2_df['highest death rate?'] = plot2_df['country'].isin(most_death_country.index.get_level_values('country'))
fig = px.scatter(plot2_df,
                 x='infected_rate',
                 y='death_rate',
                 title = 'Higher Infected Rate = Higher Death Rate? Seems it is not the case...',
                 hover_name = 'country',
                 color = 'highest death rate?',
                 trendline = 'ols')
fig.show()
The infected rate seems to have no relationship with the death rate of each country. However, the countries with the highest death rate tend to have a lower infected rate
# Distribution of the (normalized log) infected rate within each continent.
fig = px.violin(plot_df,
                title = 'Infected Rate by continent',
                y="normalized log infected_rate",
                color="continent",
                box=True,
                points = 'all',
                hover_data=['country', 'infected_rate'])
fig.show()
From the violin plot above we can see that Europe has a tighter and shorter violin plot than other continents. This could be explained by the open borders and extensive communication between countries in Europe.
# Infected rate vs. latitude, with one OLS trendline per continent.
fig = px.scatter(
    plot_df,
    title = 'Does Higher Latitude Leads to Higher Infected Rates? ',
    x = 'Latitude',
    y = 'normalized log infected_rate',
    color = 'continent',
    hover_name = 'country',
    trendline = 'ols')
fig.show()
Although the latitude seems to have negative correlation to the infected rate within each continent, continent with the highest latitude have the highest infected rate and a general negative correlation is found
# 2x2 grid: rows vary the GDP measure on the x-axis, columns vary the target
# rate on the y-axis.
fig = make_subplots(rows=2, cols=2)
columns_to_plot = [['GDP (current US$)', 'GDP per capita'], ['normalized log infected_rate', 'death_rate']]
for i in range(len(columns_to_plot)):
    for j in range(len(columns_to_plot[i])):
        # NOTE(review): x comes from columns_to_plot[0] and y from
        # columns_to_plot[1]; the loop bounds only work because both inner
        # lists happen to have length 2.
        fig.update_xaxes(title_text=(columns_to_plot[0][i] + ' - ' + columns_to_plot[1][j]), row=i+1, col=j+1)
        fig.add_trace(
            go.Scatter(
                x=plot_df[columns_to_plot[0][i]],
                y= plot_df[columns_to_plot[1][j]],
                text = plot_df['country'],
                mode = 'markers',),
            row=i+1, col=j+1,)
# Log-scale the x axes: GDP spans several orders of magnitude.
fig.update_xaxes(type="log")
fig.update_layout(
    title="Relationship Between GDP and infected rate/ death rate",
    showlegend=False
)
fig.show()
# Log GDP per capita vs. normalized log infected rate, with per-continent
# OLS trendlines.
fig= px.scatter(
    plot_df,
    x='log GDP per capita',
    y= 'normalized log infected_rate',
    hover_name = 'country',
    trendline = 'ols',
    color = 'continent',
    title = 'Weird relationship between GDP per capita and infected rate')
fig.show()
Unfortunately, higher GDP per capita leads to a higher infected rate. There could be several explanations for this phenomenon:
From the graph above we could observe that after controlling for urbanization population percentage, we still observe a positive correlation in each urbanization% group. Therefore, urbanization itself cannot explain the relationship between GDP per capita and infected rate
# Log-transform health expenditure per capita (exponential-like variable).
plot_df['log Current health expenditure per capita (current US$)']=plot_df['Current health expenditure per capita (current US$)'].apply(lambda x: math.log(x))
# Side-by-side: expenditure vs. infected rate (left) and death rate (right).
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Scatter(x=plot_df['log Current health expenditure per capita (current US$)'],
               y=plot_df['normalized log infected_rate'],
               mode = 'markers',
               name = 'normalized log infected rate',
               text = plot_df['country']),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=plot_df['log Current health expenditure per capita (current US$)'],
               y=plot_df['death_rate'],
               mode = 'markers',
               name = 'death rate',
               text = plot_df['country']),
    row=1, col=2
)
fig.update_xaxes(title_text = "Log health expenditure per capita")
fig.update_yaxes(title_font=dict(size=18, family='Courier', color='crimson'))
fig.update_layout(height=500, width=1000, title_text="Countries with more healthcare expenditure per capita have higher infected rate...")
fig.show()
Since health expenditure per capita is highly correlated to the GDP per capita, it would not be useful to visualize the low-medium-high health expenditure group(lack of variation within group). However, according to the correlation heatmap, health expenditure per capita has higher correlation with infected rate. Unfortunately, higher health expenditure per capita does not lower the death rate.
# Drop countries with missing value for visualization purpose
# (parallel-coordinates lines need every dimension present).
plot_df = plot_df[~plot_df['log Current health expenditure per capita (current US$)'].isna()]
plot_df = plot_df[~plot_df['log GDP per capita'].isna()]
plot_df = plot_df[~plot_df['normalized log infected_rate'].isna()]
fig = px.parallel_coordinates(plot_df, color='normalized log infected_rate',
                              dimensions=['log GDP per capita', 'log Current health expenditure per capita (current US$)', 'normalized log infected_rate'],
                              color_continuous_scale=px.colors.diverging.Tealrose,
                              title = 'Relationship between variables')
fig.show()
From the parallel graph above we could observe that Health Expenditure per capita, GDP per capita percentage have high correlation between each other and they also have good predictive power on the infected rate
selected_world_df.isna().sum()
For data modelling purposes, because of the large number of missing entries, I have to fill the missing values with the K-nearest neighbours method. Before using this method, I need to encode the categorical variables before the KNN imputation.
# Encode the categorical data into label before KNNimpute
encoder = LabelEncoder()

def encode(data):
    """Label-encode the non-null values of a Series in place.

    Nulls are left untouched (the KNN imputer fills them later). Returns the
    mutated Series together with the learned classes so the encoding can be
    reversed afterwards.
    """
    nonulls = np.array(data.dropna())
    # Fix: LabelEncoder.fit_transform expects a 1-D array of labels; the
    # original passed a (n, 1) column vector via reshape(-1, 1), which
    # current scikit-learn versions reject.
    impute_ordinal = encoder.fit_transform(nonulls)
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data, encoder.classes_
def encodeDataFrame(data, cat_cols):
    """Label-encode the given categorical columns of a DataFrame copy.

    Returns the encoded copy and a codebook mapping each column name to the
    array of classes produced by ``encode`` (for later decoding).
    """
    encoded = data.copy()
    codebook = {}
    for column in cat_cols:
        encoded[column], codebook[column] = encode(encoded[column])
    return encoded, codebook
# Convert the label back to categorical data
def decodeDataFrame(data, codebook):
    """Reverse label encoding in place using the codebook from encoding.

    For each column in ``codebook``, the integer labels stored in ``data``
    are replaced by the original class values they index.
    """
    for column, classes in codebook.items():
        labels = data[column].astype(int)
        data[column] = classes[labels]
# Encode the categorical cariable into numerical
# cat_cols = {'continent': [], 'sub_region': []}
# for column in cat_cols.keys():
# selected_world_df[column], cat_cols[column] = encode(selected_world_df[column])
# Replace the missing entry with KNN
def KNNimpute (data, n_neighbors):
    """Fill every missing value using a K-nearest-neighbours imputer.

    Returns a new DataFrame with the original index and column labels
    restored (the imputer itself returns a bare numpy array).
    """
    imputer = KNNImputer(n_neighbors = n_neighbors)
    imputed_values = imputer.fit_transform(data)
    return pd.DataFrame(imputed_values, index=data.index, columns=data.columns)
# Encode the two categorical columns, then impute all missing values with
# 10-nearest-neighbours.
KNN_df, codebook = encodeDataFrame(selected_world_df, ['continent', 'sub_region'])
KNN_df = KNNimpute(KNN_df, 10)
# decodeDataFrame(KNN_df, codebook)
# Project the imputed feature matrix onto its first two principal components.
X = KNN_df
pca = PCA(n_components = 2)
result = pca.fit_transform(X)
fig, ax = plt.subplots()
ax.plot(result[:, 0], result[:, 1], 'o')
ax.set_title('PCA of the dataset')
plt.show()
# Project the imputed feature matrix to 2-D with t-SNE and plot it,
# coloured by continent. Fixed random_state keeps the embedding reproducible.
tsne = TSNE(n_components=2, random_state=80)
X_tsne = tsne.fit_transform(X)
sne = pd.DataFrame(data = X_tsne)
sne.index = X.index
# Re-attach the categorical labels (KNN_df holds their encoded form).
sne[['continent', 'sub_region']] = selected_world_df[['continent', 'sub_region']]
trace1 = px.scatter(
    sne,
    title = 'T-SNE ',
    x = 0,
    y = 1,
    hover_name = sne.index.get_level_values('country'),
    color = 'continent')
# Removed dead statement `fig = go.Figure()` -- the empty figure was created
# but never used; the scatter itself is shown below.
trace1.show()
# Project the imputed feature matrix to 2-D with multidimensional scaling.
mds = MDS(n_components=2)
X_mds = mds.fit_transform(X)
X_mds = pd.DataFrame(data = X_mds)
X_mds.index = X.index
# Re-attach the categorical labels for colouring.
X_mds[['continent', 'sub_region']] = selected_world_df[['continent', 'sub_region']]
fig = px.scatter(
    X_mds,
    title = 'MDS ',
    x = 0,
    y = 1,
    hover_name = sne.index.get_level_values('country'),
    color = 'continent')
# fig = px.scatter_3d(X_mds, x=0, y=1, z=2,
#                     color='continent', hover_name = sne.index.get_level_values('country'))
fig.show()
According to the dimensionality reduction results from the three methods, most countries are similar except for several outliers, which are the US, China, Japan and Germany. The trend of the distribution resembles the total GDP of each country.
# Modelling table: drop zero-rate countries (log(0) is undefined) and add
# the log-transformed target and feature columns used by the regressions.
predict_df = KNN_df
predict_df = predict_df[predict_df['infected_rate'] != 0]
predict_df = predict_df[predict_df['death_rate'] != 0]
# Shift each log rate so its minimum is 0 ("normalized").
predict_df['normalized log infected_rate'] = (predict_df['infected_rate'] *100).apply(math.log) - min((predict_df['infected_rate'] *100).apply(math.log))
predict_df['normalized log death_rate'] = (predict_df['death_rate'] *100).apply(math.log) - min((predict_df['death_rate'] *100).apply(math.log))
predict_df['log Current health expenditure per capita'] = predict_df['Current health expenditure per capita (current US$)'].apply(math.log)
def regressionModelling(data, target, feature, random_state=None):
    """Fit an OLS linear regression of `target` on the `feature` columns.

    Splits `data` 70/30 into train/test, fits on the training split, and
    returns the test split with a 'predicted_<target>' column appended.
    Negative predictions are clipped to 0 (rates cannot be negative).

    Parameters
    ----------
    data : pd.DataFrame
        Table containing both the target and feature columns.
    target : str
        Name of the column to predict.
    feature : list of str
        Names of the predictor columns.
    random_state : int, optional
        Seed for the train/test shuffle. Backward-compatible addition;
        the default (None) keeps the original non-deterministic split.

    Returns
    -------
    pd.DataFrame
        The test split plus the prediction column.
    """
    train, test = train_test_split(data, test_size=0.3, shuffle=True,
                                   random_state=random_state)
    # Fix: copy the test slice so adding the prediction column below does
    # not write into a view of `data` (SettingWithCopyWarning).
    test = test.copy()
    train_X = train.loc[:, feature]
    train_Y = train.loc[:, target]
    test_X = test.loc[:, feature]
    test_Y = test.loc[:, target]
    model = LinearRegression()
    model.fit(train_X, train_Y)
    test_pred_Y = model.predict(test_X)
    # Clip negatives to zero and realign with the test index.
    test_pred_Y = pd.Series(test_pred_Y.clip(0, test_pred_Y.max()), index=test_Y.index)
    test['predicted_' + target] = test_pred_Y
    return test
def plotRealVSPredicted(data, real, predicted, title):
    """Scatter actual vs. predicted values with a y = x reference line.

    Parameters
    ----------
    data : pd.DataFrame
        Test split returned by regressionModelling; indexed by
        (country, iso_code).
    real : str
        Column holding the actual values.
    predicted : str
        Column holding the model predictions.
    title : str
        Figure title.

    The plot is annotated with the R^2 score of the predictions; points on
    the dotted diagonal are perfect predictions.
    """
    # Common scale for both axes and the reference line.
    max_value = max([data[real].max(), data[predicted].max()])
    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x = data[predicted],
        y = data[real],
        mode = 'markers',
        text = data.index.get_level_values('country'),
        showlegend=False,
        name = 'Predicted vs Actual'
    ))
    # Dotted y = x line: perfect-prediction reference.
    fig.add_shape(type="line",
        x0=0, y0=0, x1=max_value, y1=max_value,
        line=dict(
            color="LightSeaGreen",
            width=4,
            dash="dot",
        )
    )
    fig.update_layout(
        title=title,
        xaxis_title="Predicted Rate",
        yaxis_title="Actual Rate",
        autosize=False,
        width=500,
        height=500,
    )
    # R^2 annotation placed near the lower-right corner.
    fig.add_trace(go.Scatter(
        x=[max_value-max_value/5],
        y=[0.01],
        mode="text",
        text=['R2:' + ('% .3f' %r2_score(data[real], data[predicted]))],
        textposition="bottom center",
        showlegend=False,
    ))
    fig.show()
# Feature sets chosen from the EDA evidence above: one for the infected-rate
# model, one for the death-rate model.
ir_feature = ['log Current health expenditure per capita', 'Rural Population%', 'Latitude', 'Life Expectancy']
dr_feature = ['Unemployment%', 'Children Measles Immunization%', 'Gross Saving (%GNI)']
# Fit and evaluate both regressions, plotting actual vs. predicted values.
ir_df = regressionModelling(predict_df, 'normalized log infected_rate', ir_feature)
plotRealVSPredicted(ir_df, 'normalized log infected_rate', 'predicted_normalized log infected_rate', 'Predicting normalized log infected rate')
dr_df = regressionModelling(predict_df, 'death_rate', dr_feature)
plotRealVSPredicted(dr_df, 'death_rate', 'predicted_death_rate', 'Predicting death rate')
Using log current health expenditure, rural population percentage, latitude, and life expectancy, we successfully predict the normalized log infected rate with a moderate R-squared value. In this particular pair of training and testing sets, 49% of the variance of the infected rate in the test set can be explained by the variance of the feature variables in the training set.
In contrast, current health expenditure per capita, child measles immunization percentage and life expectancy fail to predict the death rate of each country. The model has a negative R-squared value close to zero, which means the variance of the feature variables fails to explain the variance of the death rate in the testing set.
Please note that the infected rate calculated in this research proposal relies on the official confirmed-case figures. Some countries report a suspicious "zero confirmed cases" figure, such as North Korea, Turkmenistan, and some Oceanian island countries. Therefore, it is important to note that the relationships drawn in this study only apply to the reported figures rather than the real figures.
Generally, countries with higher GDP per capita, higher health expenditure per capita, higher urban population, and higher latitude suffer more COVID-19 cases. In this proposal, three hypotheses were proposed, but there is not enough evidence in this project to support them. However, by combining some variables related to the infected rate, we successfully created a model that could reliably predict the infected rate of each country.
In contrast, none of the existing variables can explain the variation of the death rate in each country.